#!/bin/bash

# Use and modify if using HPC cluster
#SBATCH --partition=priority
#SBATCH	--mem=128G
#SBATCH	-c 14
#SBATCH	--time=1-00:00
#SBATCH	--mail-type=all
#SBATCH	--mail-user=john_doe@XXXX.edu 		#specify
#SBATCH -o ../logfiles/%j.out
#SBATCH -e ../logfiles/%j.err

# load modules

module load gcc bowtie2 samtools blat java/jdk-1.8u112
source activate /home/alt355/.conda/envs/mpra

# Variables
PROJECTID="${1}"
READ1="${2}1.fastq.gz"
READ2="${2}2.fastq.gz"
REF="${3}"
GENOME="mm10.fa.gz"				#download from UCSC database, and/or modify accordingly
CHROM="mm10.chrom.sizes"			#download from UCSC database, and/or modify accordingly
WHITELIST="output_generated_whitelist.txt"
LIBRARY_KEY="GCCTGTAT"                    	#Update library tag so that this matches each respective library 

# Specify paths
READ1="../FASTQ/${1}-mp_f-bc/${READ1}"          #complete path to Read1 including file
READ2="../FASTQ/${1}-mp_f-bc/${READ2}"          #complete path to Read2 including file
OUTPUTDIR="../Assoc/output/${PROJECTID}"      	#complete path to output directory, will be created
REF="../reference/${REF}"               	#Complete path to BAC Fasta file
GENOME="../reference/${GENOME}"         	#Complete path to Reference Genome File in fasta format
CHROM="../reference/${CHROM}"            	#complete path to UCSC chrom sizes file
BBMAPpath="/home/alt355/utils/bbmap"		#complete path to BBMAP
Rcode="Overlapping_Fragments_Collapse.R"	#complete path to R script
WHITELISTFILE="../reference/${WHITELIST}"	#complete path to WHITELIST
CORES=$SLURM_CPUS_PER_TASK			#for HPC cluster, assigns number of cores 

# Export bbduk dir to path
export PATH="$BBMAPpath:$PATH"


# Create Directories
mkdir -p "${OUTPUTDIR}"
mkdir -p "${OUTPUTDIR}/Ref"
mkdir -p "${OUTPUTDIR}/BC_Extract"
mkdir -p "${OUTPUTDIR}/Temp"
mkdir -p "${OUTPUTDIR}/Clean"
mkdir -p "${OUTPUTDIR}/Final"


# Align Reference to Genome using Minimap2
echo
echo "running BAC To Genome Mapping"
echo

minimap2 -c -x asm20 $GENOME $REF > ${OUTPUTDIR}/Ref/BAC_alignment.paf 2>> "${OUTPUTDIR}/logfile.log"

if [ $? -ne 0 ]; then
    echo "Error in step: <step-name>. Exiting."
    exit 1
fi

#Modify the alignment to include 1000bp before and after the alignments of the BAC to the genome
awk 'BEGIN {OFS="\t"} {start=($8-1000 < 0 ? 0 : $8-1000); print $6, start, $9+1000, $1}' ${OUTPUTDIR}/Ref/BAC_alignment.paf > ${OUTPUTDIR}/Ref/Align.bed

#Position sort the bed file
sort -k1,1 -k2,2n ${OUTPUTDIR}/Ref/Align.bed > ${OUTPUTDIR}/Ref/Align_Sort.bed

#Merge alignments within 100bp of eachother to create a final Reference bed file.
bedtools merge -i ${OUTPUTDIR}/Ref/Align_Sort.bed -d 100 > ${OUTPUTDIR}/Ref/Ref.bed

#Correct the Rbed file to accomodate the 1-based indexing of seqkit subseq command
awk 'BEGIN {OFS="\t"} {print $1, $2 + 1, $3}' ${OUTPUTDIR}/Ref/Ref.bed > ${OUTPUTDIR}/Ref/Corrected_Ref.bed
seqkit subseq --bed "${OUTPUTDIR}/Ref/Corrected_Ref.bed" $GENOME > "${OUTPUTDIR}/Ref/Subseq.fa"


# Get subsequence from genome
echo
echo "running seqkit"
echo
seqkit subseq --bed "${OUTPUTDIR}/Ref/Ref.bed" $GENOME > "${OUTPUTDIR}/Ref/Subseq.fa"

# Build Bowtie2 Reference from REF path
echo
echo "running bowtie2"
echo
bowtie2-build "${OUTPUTDIR}/Ref/Subseq.fa" "${OUTPUTDIR}/Ref/Bowtie2_Ref"

if [ $? -ne 0 ]; then
    echo "Error in step: Build Bowtie2 Reference. Exiting."
    exit 1
fi

# Extract BC
echo
echo "running umi_tools"
echo
umi_tools extract -p NNNNNNNNNNNNNNNNNNNNNNNNCCCCCCCC -I $READ1 -S "${OUTPUTDIR}/BC_Extract/BC_Extract_R1.fastq.gz" \
          --read2-in=$READ2 --read2-out="${OUTPUTDIR}/BC_Extract/BC_Extract_R2.fastq.gz"

if [ $? -ne 0 ]; then
    echo "Error in step: Extract BC. Exiting."
    exit 1
fi

# Adapter Trim using bbduk (change the ref path to your bbduk install)
echo
echo "running bbduk"
echo
bbduk.sh in1="${OUTPUTDIR}/BC_Extract/BC_Extract_R1.fastq.gz" \
         in2="${OUTPUTDIR}/BC_Extract/BC_Extract_R2.fastq.gz" \
         out1="${OUTPUTDIR}/Clean/Clean_R1.fastq.gz" \
         out2="${OUTPUTDIR}/Clean/Clean_R2.fastq.gz" \
         ref="${BBMAPpath}/resources/adapters.fa" ktrim=r k=23 mink=11 hdist=1 tpe tbo

# Map Trimmed Reads (Change -X flag if you want to retain larger fragments- it's in BP.)
echo
echo "running bowtie2"
echo
bowtie2 -p $CORES -X 800 --very-sensitive-local --dovetail --no-mixed --no-discordant \
        -x "${OUTPUTDIR}/Ref/Bowtie2_Ref" \
        -1 "${OUTPUTDIR}/Clean/Clean_R1.fastq.gz" \
        -2 "${OUTPUTDIR}/Clean/Clean_R2.fastq.gz" | samtools view -bS -f 0x2 - | samtools sort - > "${OUTPUTDIR}/Temp/Bowtie_Aligned.bam"

if [ $? -ne 0 ]; then
    echo "Error in step: Map Trimmed Reads. Exiting."
    exit 1
fi

# Index alignment file
echo
echo "running samtools"
echo
samtools index "${OUTPUTDIR}/Temp/Bowtie_Aligned.bam"

if [ $? -ne 0 ]; then
    echo "Error in step: Index alignment file. Exiting."
    exit 1
fi

# Use umi_tools to identify valid BC sequences (default edit distance = 1)
echo
echo "running umi_tools"
echo
umi_tools group --edit-distance-threshold=2 --paired -I "${OUTPUTDIR}/Temp/Bowtie_Aligned.bam" \
                --group-out="${OUTPUTDIR}/Temp/Frag_BC_Collapsed.tsv" \
                --log="${OUTPUTDIR}/Temp/Frag_BC_Collapsed.log"

if [ $? -ne 0 ]; then
    echo "Error in step: Use umi_tools to identify valid BC sequences. Exiting."
    exit 1
fi

# Tabulate All BCs in Association Library, sort by BC
awk 'BEGIN {OFS="\t"}; NR>1{print $7,$8}' "${OUTPUTDIR}/Temp/Frag_BC_Collapsed.tsv" | uniq | sort -k1,1 - > "${OUTPUTDIR}/Temp/Frag_BC_Collapsed_Count_Raw.txt"


# Add Library Normalized BC Counts to Frag Set
awk 'BEGIN {OFS="\t"}; NR>1{print $7,$1}' "${OUTPUTDIR}/Temp/Frag_BC_Collapsed.tsv" | sort -k1,1 -  > "${OUTPUTDIR}/Temp/Frag_BC_Collapsed_Reduced.txt"


# Convert mapped reads in bam file to paired-end bed file
echo
echo "running samtools"
echo
samtools sort -n "${OUTPUTDIR}/Temp/Bowtie_Aligned.bam" | bedtools bamtobed -i stdin -bedpe > "${OUTPUTDIR}/Temp/Bowtie_Aligned.bed"

if [ $? -ne 0 ]; then
    echo "Error in step: Convert mapped reads in bam file to paired-end bed file. Exiting."
    exit 1
fi

# Convert to Bed Format
awk 'BEGIN {OFS="\t"}; {print $1,$7,$2,$3,$5,$6}' "${OUTPUTDIR}/Temp/Bowtie_Aligned.bed" | awk  'BEGIN {OFS="\t"}; {a=0; b=0; for (i=1;i<=NF;i++) if ($i < a || i == 1)a = $i; else if($i > b|| i == 1)b = $i; print $2, $1, a, b}' - | sort -k1,1 - > "${OUTPUTDIR}/Temp/Bowtie_Aligned_Reduced.bed"


# Sort BC collapsed file
awk 'BEGIN {OFS="\t"}; NR>1{print $0}' "${OUTPUTDIR}/Temp/Frag_BC_Collapsed.tsv" | sort -k1,1 - > "${OUTPUTDIR}/Temp/Frag_BC_Collapsed_Sorted.tsv"


# Merge bed file w/ coordinates to corrected BC
echo
echo "joining"
echo
join -j 1 "${OUTPUTDIR}/Temp/Bowtie_Aligned_Reduced.bed" "${OUTPUTDIR}/Temp/Frag_BC_Collapsed_Sorted.tsv" | awk 'BEGIN {OFS="\t"}; {print $2,$3,$4,$10,$1}' - | sort -k1,1 -k2,2n -k3,3n -  > "${OUTPUTDIR}/Temp/Frag_BC_Corrected.txt"

#Check for contaminating barcodes
echo
echo "Searching for contaminating barcodes"
echo

awk -F'\t' 'BEGIN {OFS="\t"} {split($5, arr, "_"); $6 = arr[2]; print}' ${OUTPUTDIR}/Temp/Frag_BC_Corrected.txt > ${OUTPUTDIR}/Temp/Frag_BC_Corrected_Library.txt
cat ${OUTPUTDIR}/Temp/Frag_BC_Corrected_Library.txt | cut -f6 | sort | uniq -c > ${OUTPUTDIR}/Final/Library_BC_Counts.txt

#Remove Contaminating Reads from library
awk -F '\t' -v var="$LIBRARY_KEY" '$6 == var {print $1, $2, $3, $4, $5}' ${OUTPUTDIR}/Temp/Frag_BC_Corrected_Library.txt > ${OUTPUTDIR}/Temp/Frag_BC_Corrected.txt


# Use R script with GRanges to collapse by overlap and output resulting bed file.
echo
echo "running Rscript"
echo
module load gcc/9.2.0 R/4.2.1
Rscript --vanilla $Rcode "${OUTPUTDIR}/Temp"
module load gcc samtools blat conda3 bedtools

# Correct genomic coordinates due to subsampling (this will fail if there are multiple "mapped" sites from blat)
BASECORRECTION=`cat "${OUTPUTDIR}/Ref/Ref.bed" | awk '{print $2}'`
CHRCORRECTION=`cat "${OUTPUTDIR}/Ref/Ref.bed" | awk '{print $1}'`

awk -v var1=$BASECORRECTION -v var2=$CHRCORRECTION 'BEGIN {OFS="\t"}; {print $1,var2,$3+var1,$4+var1,$5}' "${OUTPUTDIR}/Temp/BC_Corrected_Collapse.txt" > "${OUTPUTDIR}/Final/BC_Corrected_Final_Collapse.txt"


# Sort by location 
awk 'BEGIN {OFS="\t"}; {print $2,$3,$4,$1,$5}' "${OUTPUTDIR}/Final/BC_Corrected_Final_Collapse.txt" | sort -k1,1 -k2,2n -k3,3n - > "${OUTPUTDIR}/Final/BC_Corrected_Collapse_Location_Sort.txt"


# Convert Base-corrected Unique BC/Fragment Pairings into bam file
cat "${OUTPUTDIR}/Final/BC_Corrected_Collapse_Location_Sort.txt" | awk 'BEGIN {OFS="\t"}; {print $1,$2,$3,"Read_"NR}' - | bedtools bedtobam -g $CHROM -i stdin | samtools sort - > "${OUTPUTDIR}/Final/Count_Unique.bam"


# Convert into Bam file after expanding bed entries to match read number (Convert the bedtools bedtobam genome file to variable #5 after this run is complete)
awk '{for(i=0;i<$5;i++) print}' "${OUTPUTDIR}/Final/BC_Corrected_Collapse_Location_Sort.txt" | awk 'BEGIN {OFS="\t"}; {print $1,$2,$3,"Read_"NR}' - | bedtools bedtobam -g $CHROM -i stdin | samtools sort - > "${OUTPUTDIR}/Final/Count_Expand.bam"

# Index corrected bam file
echo
echo "running samtools"
echo
samtools index "${OUTPUTDIR}/Final/Count_Expand.bam"
samtools index "${OUTPUTDIR}/Final/Count_Unique.bam"

if [ $? -ne 0 ]; then
    echo "Error in step: Index corrected bam file. Exiting."
    exit 1
fi

# Make Bigwig file of unique BC/Fragment pairings
echo
echo "running deeptools"
echo
module load gcc/9.2.0 deeptools
bamCoverage -p $CORES -bs 1 -b "${OUTPUTDIR}/Final/Count_Unique.bam" -o "${OUTPUTDIR}/Final/Frag_BC_Reformed.bw"


# Make Bigwig file of Read-Count expanded BC/Fragment pairings
bamCoverage -p $CORES -bs 1 -b "${OUTPUTDIR}/Final/Count_Expand.bam" -o "${OUTPUTDIR}/Final/Frag_BC_Norm.bw"

# Cleanup - optional
# rm -R "${OUTPUTDIR}/BC_Extract"
# rm -R "${OUTPUTDIR}/Temp"
# rm -R "${OUTPUTDIR}/Clean"

